1

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(insuranceData)
library(plotly)
## Warning: Paket 'plotly' wurde unter R Version 4.2.2 erstellt
## 
## Attache Paket: 'plotly'
## 
## Das folgende Objekt ist maskiert 'package:ggplot2':
## 
##     last_plot
## 
## Das folgende Objekt ist maskiert 'package:stats':
## 
##     filter
## 
## Das folgende Objekt ist maskiert 'package:graphics':
## 
##     layout
library(scales)
## 
## Attache Paket: 'scales'
## 
## Das folgende Objekt ist maskiert 'package:purrr':
## 
##     discard
## 
## Das folgende Objekt ist maskiert 'package:readr':
## 
##     col_factor
data(dataCar)
a.  Draw barplots of the discrete variables "numclaims", "agecat" (categorized driver age), and "gender".
ggplot(data = dataCar, mapping = aes(x = numclaims)) +
  geom_bar(fill = "#83b692")

ggplot(data = dataCar, mapping = aes(x = agecat)) +
  geom_bar(fill = "#F9ADA0")

ggplot(data = dataCar, mapping = aes(x = gender)) +
  geom_bar(fill = "#5B3758")

b.  Draw a histogram of the vehicle value "veh_value" (in 10'000 Australian Dollars). Truncate values above 7 (this means: if a value is larger than 7, set it to 7).
# use pmin instead of rowwise + min (crazy inefficient)
d <- dataCar %>% 
  rowwise %>% 
  mutate(trunc_veh_value = min(veh_value, 7))

p <- ggplot(d, aes(x = trunc_veh_value)) +
  geom_histogram(fill = "#83b692", bins = 30) +
  ggtitle("Histogram of vehicle value") +
  xlab("Price in 10'000 AUD")
p

c.  Calculate the average number of claims per level of "agecat" and visualize the result as a scatterplot. Interpret the result.
d <- dataCar %>% 
  group_by(agecat) %>% 
  summarise(avg_claims = mean(numclaims))

p <- ggplot(d, mapping = aes(x = agecat, y = avg_claims)) +
  geom_point(color = "black") +
  xlab("Age category, 1=youngest") +
  ggtitle("Scatterplot")
p

Interpretation: The higher the age category, the lower the average number of claims. Especially, age categories 5 and 6 have a lower average than the other age categories and age category has a higher average than the others.

d.  Bin "veh_value" into quartiles and analyze its association with the number of claims as in 1c.
d <- dataCar %>%
  mutate(quartile_rank = ntile(veh_value,4)) %>% 
  group_by(quartile_rank) %>% 
  summarise(avg_claims = mean(numclaims))

p <- ggplot(d, mapping = aes(x = quartile_rank, y = avg_claims)) +
  geom_point(color = "black") +
  xlab("Quartile rank, 1=0.25 quartile") +
  ggtitle("Scatterplot")
p

Interpretation: The higher the value of the vehicle, the higher the average number of claims.

e.  Use the "plotly" package to turn the plot from d. interactive.
p %>% 
  ggplotly()

4. Sieve of Eratosthenes

The sieve of Eratosthenes is an ancient algorithm to get all prime numbers up to any given limit \(n\), see Wikipedia. Write a function sieve_of_eratosthenes(n) that returns all prime numbers up to \(n\). Benchmark the results for \(n = 10^5\) with the package “microbenchmark”. Mind your coding style!

Naive approach:

sieve_of_eratosthenes <- function(n) {
  df <- data.frame(seq(2,n))
  colnames(df) <- c("x")
  
  p <- 2
  i <- 1
  done <- FALSE
  while(!done) {
    df <- df %>% filter(x %% p != 0 | x == p)
    
    i <- i + 1
    if (i > nrow(df)) {
      done = TRUE
    } else {
      p <- df$x[i]
    }
  }
  df
}
head(sieve_of_eratosthenes(1000))
##    x
## 1  2
## 2  3
## 3  5
## 4  7
## 5 11
## 6 13
sieve_of_eratosthenes_opt <- function(n) {
  v <- seq(2,n)
  
  p <- 2
  i <- 1
  done <- FALSE
  while(!done) {
    v <- v[v %% p != 0 | v == p]
    
    i <- i + 1
    if (i > length(v)) {
      done = TRUE
    } else {
      p <- v[i]
    }
  }
  v
}

head(sieve_of_eratosthenes_opt(1000))
## [1]  2  3  5  7 11 13

Comparison

library(microbenchmark)
## Warning: Paket 'microbenchmark' wurde unter R Version 4.2.2 erstellt
n = 10000
microbenchmark(
  sieve_of_eratosthenes(n),
  sieve_of_eratosthenes_opt(n),
  times = 2
)
## Unit: milliseconds
##                          expr       min        lq      mean    median        uq
##      sieve_of_eratosthenes(n) 8626.1765 8626.1765 9027.6796 9027.6796 9429.1826
##  sieve_of_eratosthenes_opt(n)   36.2805   36.2805   38.2185   38.2185   40.1565
##        max neval cld
##  9429.1826     2  a 
##    40.1565     2   b

4

In Exercise 1c, we have calculated and plotted the average number of claims per level of “agecat” in the dataCar data.

a.  Write a function `avg_claim_counts(v)` that provides such a visualization for any discrete variable `v`.

Source: https://shixiangwang.github.io/tidyeval-chinese/dplyr.html

avg_claim_counts <- function(v) {
  v <- enquo(v)
  
  d <- dataCar %>% 
  group_by(!!v) %>% 
  summarise(avg_claims = mean(numclaims))

  p <- ggplot(d, mapping = aes(x = !!v, y = avg_claims)) +
    geom_point(color = "black") +
    xlab(v) +
    ggtitle("Scatterplot")
  p
}

avg_claim_counts(agecat)

avg_claim_counts(gender)

avg_claim_counts(area)

b.  Extend this function with a second argument `interactive` to control whether the resulting plot is interactive or not.
library(plotly)

avg_claim_counts_2 <- function(v, interactive = FALSE) {
  v <- enquo(v)
  
  d <- dataCar %>% 
  group_by(!!v) %>% 
  summarise(avg_claims = mean(numclaims))

  p <- ggplot(d, mapping = aes(x = !!v, y = avg_claims)) +
    geom_point(color = "black") +
    ggtitle("Scatterplot")
  if (interactive) {
    p <- p %>% 
      ggplotly()
  }
  p
}

avg_claim_counts_2(agecat, TRUE)
avg_claim_counts_2(gender, TRUE)
avg_claim_counts_2(area, TRUE)

5

Extend the “student” class from Section “plot, print, summary” by the optional information “semester”. It represents the number of semesters the student is already registered. Add a summary() method that would neatly print the name and the semester of the student.

# Function that creates an object of class "student"
student <- function(given_name, family_name, semester = NULL) {
  out <- list(
    given_name = given_name,
    family_name = family_name,
    semester = semester
  )
  class(out) <- "student"
  out
}

me <- student("Roland", "Widmer")
me2 <- student("Roland", "Widmer", 11)

summary.student <- function(x, ...) {
  cat(x$given_name, x$family_name, ifelse(x$semester, paste(", Semester:", x$semester), ""))
  invisible(x) # do not output x itself
}

summary(me)
## Roland Widmer
summary(me2)
## Roland Widmer , Semester: 11